%%capture
# The cell magic above hides this cell's output (installer logs are verbose).
import sys
# Install/upgrade pandas-profiling (with its notebook extras) using the same
# interpreter that runs this kernel, so the package lands in the right environment.
!{sys.executable} -m pip install -U pandas-profiling[notebook]
# Enable the ipywidgets notebook extension
# (presumably required by the widget-based profile report used later - confirm).
!jupyter nbextension enable --py widgetsnbextension
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
import plotly.express as px
# Enlarge the default matplotlib figure size (width, height)
plt.rcParams.update({'figure.figsize': [22, 20]})
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/students-performance-in-exams/StudentsPerformance.csv
# Load the students' performance dataset into a DataFrame
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/students-performance-in-exams/StudentsPerformance.csv',
)
# Preview the first five rows
df.head(n=5)
| | gender | race/ethnicity | parental level of education | lunch | test preparation course | math score | reading score | writing score |
|---|---|---|---|---|---|---|---|---|
| 0 | female | group B | bachelor's degree | standard | none | 72 | 72 | 74 |
| 1 | female | group C | some college | standard | completed | 69 | 90 | 88 |
| 2 | female | group B | master's degree | standard | none | 90 | 95 | 93 |
| 3 | male | group A | associate's degree | free/reduced | none | 47 | 57 | 44 |
| 4 | male | group C | some college | standard | none | 76 | 78 | 75 |
# Show the data type of each column
df.dtypes
gender object race/ethnicity object parental level of education object lunch object test preparation course object math score int64 reading score int64 writing score int64 dtype: object
# Build a profiling report for the whole DataFrame and render it as notebook widgets
profile = ProfileReport(
    df,
    title='Scores profiling report',
    explorative=True,
)
profile.to_widgets()
# Count the missing values in each column
df.isna().sum()
# Every column reports zero, so the dataset has no missing values.
gender 0 race/ethnicity 0 parental level of education 0 lunch 0 test preparation course 0 math score 0 reading score 0 writing score 0 dtype: int64
# Distribution of each score: a box plot followed by a histogram,
# in the order math, reading, writing.
for score_column in ("math score", "reading score", "writing score"):
    fig = px.box(df, y=score_column)
    fig.show()
    fig = px.histogram(df, x=score_column)
    fig.show()
From the df.describe() summary and the box plots above, one can see that the score distributions are very similar across the three subjects.
# Pairwise correlation between the score columns, shaded by strength.
# Only numeric columns are kept: the categorical (object) columns cannot be
# correlated, and recent pandas versions raise instead of silently dropping them.
df.select_dtypes(include="number").corr().style.background_gradient(cmap="Blues")
| | math score | reading score | writing score |
|---|---|---|---|
| math score | 1.000000 | 0.817580 | 0.802642 |
| reading score | 0.817580 | 1.000000 | 0.954598 |
| writing score | 0.802642 | 0.954598 | 1.000000 |
We can see a strong positive correlation among the three scores, especially between reading and writing (about 0.95).
# Relative frequency (share of rows) of each gender.
# normalize=True is passed explicitly; the first positional argument of
# value_counts is the normalize flag, not a column name.
df2 = df['gender'].value_counts(normalize=True)
print(df2)
female 0.518 male 0.482 Name: gender, dtype: float64
# Absolute number of students per gender, stored in a dictionary.
proportion = {
    label: df.loc[df['gender'] == label, 'gender'].count()
    for label in ('female', 'male')
}
print(proportion)
{'female': 518, 'male': 482}
# Data to plot: upper-cased gender labels and their student counts
labels = [name.upper() for name in proportion]
sizes = list(proportion.values())
# Bar chart of students per gender
fig = px.bar(x=labels, y=sizes)
fig.show()
One can see that the gender split is nearly balanced: about 52% female and 48% male.
# Share of students per parental education level
# (normalize=True requested explicitly rather than via a truthy positional string).
df['parental level of education'].value_counts(normalize=True)
some college 0.226 associate's degree 0.222 high school 0.196 some high school 0.179 bachelor's degree 0.118 master's degree 0.059 Name: parental level of education, dtype: float64
# Share of students per lunch type
# (normalize=True requested explicitly rather than via a truthy positional string).
df['lunch'].value_counts(normalize=True)
standard 0.645 free/reduced 0.355 Name: lunch, dtype: float64
One can see that most students (about 65%) have the standard lunch.
# Share of students per race/ethnicity group
# (normalize=True requested explicitly rather than via a truthy positional string).
df['race/ethnicity'].value_counts(normalize=True)
group C 0.319 group D 0.262 group B 0.190 group E 0.140 group A 0.089 Name: race/ethnicity, dtype: float64
# Absolute number of students per race/ethnicity group, keyed as "group_<letter>".
groups = ['A', 'B', 'C', 'D', 'E']
race2 = {
    f'group_{letter}': df.loc[
        df['race/ethnicity'] == f'group {letter}', 'race/ethnicity'
    ].count()
    for letter in groups
}
print(race2)
{'group_A': 89, 'group_B': 190, 'group_C': 319, 'group_D': 262, 'group_E': 140}
# Split the dictionary into parallel lists of group names and counts
races = [*race2]
values = [*race2.values()]
# Bar chart of students per race/ethnicity group
fig = px.bar(x=races, y=values)
fig.show()